{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/integrations/openai/beyond_search_webinar/01_index-init.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/integrations/openai/beyond_search_webinar/01_index-init.ipynb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Index Init\n",
    "\n",
     "We use this notebook to push context data to Pinecone. Before running this notebook, the `data/curie_embeddings.parquet` file must have been created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>docs</th>\n",
       "      <th>category</th>\n",
       "      <th>thread</th>\n",
       "      <th>href</th>\n",
       "      <th>question</th>\n",
       "      <th>context</th>\n",
       "      <th>marked</th>\n",
       "      <th>text</th>\n",
       "      <th>n_tokens</th>\n",
       "      <th>embeddings</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>Can’t download (some) models although they are...</td>\n",
       "      <td>https://discuss.huggingface.co/t/cant-download...</td>\n",
       "      <td>Can’t download (some) models to pytorch, altho...</td>\n",
       "      <td>Looking at umarayub/t5-small-finetuned-xsum at...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: Can’...</td>\n",
       "      <td>550</td>\n",
       "      <td>[0.004923707339912653, -0.016777075827121735, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>Trainer.push_to_hub is taking lot of time, is ...</td>\n",
       "      <td>https://discuss.huggingface.co/t/trainer-push-...</td>\n",
       "      <td>Hi, I’m trying to push my model to HF hub via ...</td>\n",
       "      <td>@sgugger  can you please help me out with this...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: Trai...</td>\n",
       "      <td>204</td>\n",
       "      <td>[0.0020476023200899363, -0.0010360622545704246...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>SSLCertVerificationError when loading a model</td>\n",
       "      <td>https://discuss.huggingface.co/t/sslcertverifi...</td>\n",
       "      <td>I am exploring potential opportunities of usin...</td>\n",
       "      <td>I’m also getting the same error. Please let me...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: SSLC...</td>\n",
       "      <td>494</td>\n",
       "      <td>[0.002923486055806279, 0.007949204184114933, 0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>How to use embeddings to compute similarity?</td>\n",
       "      <td>https://discuss.huggingface.co/t/how-to-use-em...</td>\n",
       "      <td>Hi, I would like to compute sentence similarit...</td>\n",
       "      <td>With transformers, the feature-extraction pipe...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: How ...</td>\n",
       "      <td>351</td>\n",
       "      <td>[-0.011044162325561047, 0.0021849798504263163,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>How to use additional input features for NER?</td>\n",
       "      <td>https://discuss.huggingface.co/t/how-to-use-ad...</td>\n",
       "      <td>Hello,\\nI’ve been following the documentation ...</td>\n",
       "      <td>mhl:\\n\\ne.g [“Arizona_NNP”, “Ice_NNP”, “Tea_NN...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: How ...</td>\n",
       "      <td>1718</td>\n",
       "      <td>[0.002879042411223054, -0.004730842541903257, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          docs   category                                             thread  \\\n",
       "0  huggingface  Beginners  Can’t download (some) models although they are...   \n",
       "1  huggingface  Beginners  Trainer.push_to_hub is taking lot of time, is ...   \n",
       "2  huggingface  Beginners      SSLCertVerificationError when loading a model   \n",
       "3  huggingface  Beginners       How to use embeddings to compute similarity?   \n",
       "4  huggingface  Beginners      How to use additional input features for NER?   \n",
       "\n",
       "                                                href  \\\n",
       "0  https://discuss.huggingface.co/t/cant-download...   \n",
       "1  https://discuss.huggingface.co/t/trainer-push-...   \n",
       "2  https://discuss.huggingface.co/t/sslcertverifi...   \n",
       "3  https://discuss.huggingface.co/t/how-to-use-em...   \n",
       "4  https://discuss.huggingface.co/t/how-to-use-ad...   \n",
       "\n",
       "                                            question  \\\n",
       "0  Can’t download (some) models to pytorch, altho...   \n",
       "1  Hi, I’m trying to push my model to HF hub via ...   \n",
       "2  I am exploring potential opportunities of usin...   \n",
       "3  Hi, I would like to compute sentence similarit...   \n",
       "4  Hello,\\nI’ve been following the documentation ...   \n",
       "\n",
       "                                             context  marked  \\\n",
       "0  Looking at umarayub/t5-small-finetuned-xsum at...       0   \n",
       "1  @sgugger  can you please help me out with this...       0   \n",
       "2  I’m also getting the same error. Please let me...       0   \n",
       "3  With transformers, the feature-extraction pipe...       0   \n",
       "4  mhl:\\n\\ne.g [“Arizona_NNP”, “Ice_NNP”, “Tea_NN...       0   \n",
       "\n",
       "                                                text  n_tokens  \\\n",
       "0  Topic: huggingface - Beginners; Question: Can’...       550   \n",
       "1  Topic: huggingface - Beginners; Question: Trai...       204   \n",
       "2  Topic: huggingface - Beginners; Question: SSLC...       494   \n",
       "3  Topic: huggingface - Beginners; Question: How ...       351   \n",
       "4  Topic: huggingface - Beginners; Question: How ...      1718   \n",
       "\n",
       "                                          embeddings  \n",
       "0  [0.004923707339912653, -0.016777075827121735, ...  \n",
       "1  [0.0020476023200899363, -0.0010360622545704246...  \n",
       "2  [0.002923486055806279, 0.007949204184114933, 0...  \n",
       "3  [-0.011044162325561047, 0.0021849798504263163,...  \n",
       "4  [0.002879042411223054, -0.004730842541903257, ...  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
     "import pandas as pd\n",
     "\n",
     "# Load the pre-computed embeddings dataframe (created by the earlier embedding notebook)\n",
     "df = pd.read_parquet('data/curie_embeddings.parquet')\n",
     "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
     "The max size limit for metadata in Pinecone is 5KB; let's check the *text* field to see if it has items greater than this."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1047 / 5957 records are too big\n"
     ]
    }
   ],
   "source": [
    "from sys import getsizeof\n",
    "\n",
    "too_big = []\n",
    "\n",
    "for text in df['text'].tolist():\n",
    "    if getsizeof(text) > 5000:\n",
    "        too_big.append((text, getsizeof(text)))\n",
    "\n",
    "print(f\"{len(too_big)} / {len(df)} records are too big\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Unfortunately there are plenty, so we will make sure to include some mapping from retrieved IDs (from Pinecone) to the original text. We can do this by assigning a unique ID to each text item and storing it with the Streamlit app."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>docs</th>\n",
       "      <th>category</th>\n",
       "      <th>thread</th>\n",
       "      <th>href</th>\n",
       "      <th>question</th>\n",
       "      <th>context</th>\n",
       "      <th>marked</th>\n",
       "      <th>text</th>\n",
       "      <th>n_tokens</th>\n",
       "      <th>embeddings</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>Can’t download (some) models although they are...</td>\n",
       "      <td>https://discuss.huggingface.co/t/cant-download...</td>\n",
       "      <td>Can’t download (some) models to pytorch, altho...</td>\n",
       "      <td>Looking at umarayub/t5-small-finetuned-xsum at...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: Can’...</td>\n",
       "      <td>550</td>\n",
       "      <td>[0.004923707339912653, -0.016777075827121735, ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>Trainer.push_to_hub is taking lot of time, is ...</td>\n",
       "      <td>https://discuss.huggingface.co/t/trainer-push-...</td>\n",
       "      <td>Hi, I’m trying to push my model to HF hub via ...</td>\n",
       "      <td>@sgugger  can you please help me out with this...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: Trai...</td>\n",
       "      <td>204</td>\n",
       "      <td>[0.0020476023200899363, -0.0010360622545704246...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>SSLCertVerificationError when loading a model</td>\n",
       "      <td>https://discuss.huggingface.co/t/sslcertverifi...</td>\n",
       "      <td>I am exploring potential opportunities of usin...</td>\n",
       "      <td>I’m also getting the same error. Please let me...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: SSLC...</td>\n",
       "      <td>494</td>\n",
       "      <td>[0.002923486055806279, 0.007949204184114933, 0...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>How to use embeddings to compute similarity?</td>\n",
       "      <td>https://discuss.huggingface.co/t/how-to-use-em...</td>\n",
       "      <td>Hi, I would like to compute sentence similarit...</td>\n",
       "      <td>With transformers, the feature-extraction pipe...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: How ...</td>\n",
       "      <td>351</td>\n",
       "      <td>[-0.011044162325561047, 0.0021849798504263163,...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>huggingface</td>\n",
       "      <td>Beginners</td>\n",
       "      <td>How to use additional input features for NER?</td>\n",
       "      <td>https://discuss.huggingface.co/t/how-to-use-ad...</td>\n",
       "      <td>Hello,\\nI’ve been following the documentation ...</td>\n",
       "      <td>mhl:\\n\\ne.g [“Arizona_NNP”, “Ice_NNP”, “Tea_NN...</td>\n",
       "      <td>0</td>\n",
       "      <td>Topic: huggingface - Beginners; Question: How ...</td>\n",
       "      <td>1718</td>\n",
       "      <td>[0.002879042411223054, -0.004730842541903257, ...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          docs   category                                             thread  \\\n",
       "0  huggingface  Beginners  Can’t download (some) models although they are...   \n",
       "1  huggingface  Beginners  Trainer.push_to_hub is taking lot of time, is ...   \n",
       "2  huggingface  Beginners      SSLCertVerificationError when loading a model   \n",
       "3  huggingface  Beginners       How to use embeddings to compute similarity?   \n",
       "4  huggingface  Beginners      How to use additional input features for NER?   \n",
       "\n",
       "                                                href  \\\n",
       "0  https://discuss.huggingface.co/t/cant-download...   \n",
       "1  https://discuss.huggingface.co/t/trainer-push-...   \n",
       "2  https://discuss.huggingface.co/t/sslcertverifi...   \n",
       "3  https://discuss.huggingface.co/t/how-to-use-em...   \n",
       "4  https://discuss.huggingface.co/t/how-to-use-ad...   \n",
       "\n",
       "                                            question  \\\n",
       "0  Can’t download (some) models to pytorch, altho...   \n",
       "1  Hi, I’m trying to push my model to HF hub via ...   \n",
       "2  I am exploring potential opportunities of usin...   \n",
       "3  Hi, I would like to compute sentence similarit...   \n",
       "4  Hello,\\nI’ve been following the documentation ...   \n",
       "\n",
       "                                             context  marked  \\\n",
       "0  Looking at umarayub/t5-small-finetuned-xsum at...       0   \n",
       "1  @sgugger  can you please help me out with this...       0   \n",
       "2  I’m also getting the same error. Please let me...       0   \n",
       "3  With transformers, the feature-extraction pipe...       0   \n",
       "4  mhl:\\n\\ne.g [“Arizona_NNP”, “Ice_NNP”, “Tea_NN...       0   \n",
       "\n",
       "                                                text  n_tokens  \\\n",
       "0  Topic: huggingface - Beginners; Question: Can’...       550   \n",
       "1  Topic: huggingface - Beginners; Question: Trai...       204   \n",
       "2  Topic: huggingface - Beginners; Question: SSLC...       494   \n",
       "3  Topic: huggingface - Beginners; Question: How ...       351   \n",
       "4  Topic: huggingface - Beginners; Question: How ...      1718   \n",
       "\n",
       "                                          embeddings id  \n",
       "0  [0.004923707339912653, -0.016777075827121735, ...  0  \n",
       "1  [0.0020476023200899363, -0.0010360622545704246...  1  \n",
       "2  [0.002923486055806279, 0.007949204184114933, 0...  2  \n",
       "3  [-0.011044162325561047, 0.0021849798504263163,...  3  \n",
       "4  [0.002879042411223054, -0.004730842541903257, ...  4  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
     "# Assign a sequential string ID to each record — Pinecone vector IDs must be strings\n",
     "df['id'] = [str(i) for i in range(len(df))]\n",
     "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's populate the Pinecone index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pinecone import Pinecone\n",
    "\n",
    "pinecone.init(\n",
    "    api_key='PINECONE_API_KEY',  # app.pinecone.io\n",
    "    environment=\"YOUR_ENV\"  # find next to API key in console\n",
    ")\n",
    "\n",
    "index_name = 'beyond-search-openai'\n",
    "\n",
    "if not index_name in pinecone.list_indexes().names():\n",
    "    pinecone.create_index(\n",
    "        index_name, dimension=len(df['embeddings'].tolist()[0]),\n",
    "        metric='cosine'\n",
    "    )\n",
    "\n",
    "index = pinecone.Index(index_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will populate in batches, including any relevant metadata like *docs*, *category*, *thread*, and *href*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 187/187 [08:36<00:00,  2.76s/it]\n"
     ]
    }
   ],
   "source": [
     "from tqdm.auto import tqdm\n",
     "\n",
     "batch_size = 32\n",
     "\n",
     "# Upsert in batches of (id, vector, metadata) tuples. The 'text' field is\n",
     "# deliberately excluded from metadata because many records exceed the\n",
     "# metadata size limit checked above; the full text is recovered later via\n",
     "# the ID -> text mapping file.\n",
     "for i in tqdm(range(0, len(df), batch_size)):\n",
     "    i_end = min(i+batch_size, len(df))  # clamp the final, possibly partial batch\n",
     "    df_slice = df.iloc[i:i_end]\n",
     "    to_upsert = [\n",
     "        (\n",
     "            row['id'],\n",
     "            row['embeddings'].tolist(),\n",
     "            {\n",
     "                'docs': row['docs'],\n",
     "                'category': row['category'],\n",
     "                'thread': row['thread'],\n",
     "                'href': row['href'],\n",
     "                'n_tokens': row['n_tokens']\n",
     "            }\n",
     "        ) for _, row in df_slice.iterrows()\n",
     "    ]\n",
     "    index.upsert(vectors=to_upsert)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll save another dataset to file containing just ID -> text mappings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "mappings = {row['id']: row['text'] for _, row in df[['id', 'text']].iterrows()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
     "import json\n",
     "\n",
     "# Persist the ID -> text mapping for the downstream Streamlit app\n",
     "with open('data/mapping.json', 'w') as fp:\n",
     "    json.dump(mappings, fp)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
